library(TSA)
library(hts)
library(tidyverse)
library(lubridate)
library(prophet)
library(ggplot2)
library(plotly)
# Reset the session before the analysis.
# NOTE(review): rm(list = ls()) in a script is generally discouraged (it only
# clears the global environment, not loaded packages or options); kept here to
# reproduce the original notebook run.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2284636 122.1 4397135 234.9 NA 4321059 230.8
## Vcells 3902263 29.8 8388608 64.0 16384 6005348 45.9
# Import Data ----
# Location of the project data; adjust to your environment.
datapath <- "/Volumes/SSD/TS-Project/data"
# Daily sales aggregated by city/type/item (prepared upstream of this script).
train <- read.csv(file = file.path(datapath, "subset", "train_city_type_item.csv"))
# SMAPE Function ----
#' Symmetric mean absolute percentage error (SMAPE).
#'
#' Uses the unscaled form mean(|pred - actual| / (|actual| + |pred|)),
#' i.e. without the conventional factor of 2 and not expressed in percent,
#' matching how it is compared throughout this script (values in [0, 1]).
#' Terms where both the prediction and the actual are 0 would be 0/0 (NaN)
#' and poison the mean; they are defined to contribute zero error, the
#' usual SMAPE convention.
#'
#' @param y_pred numeric vector of forecasts.
#' @param y_actual numeric vector of observed values, same length as y_pred.
#' @return a single numeric error in [0, 1].
smape <- function(y_pred, y_actual) {
  denom <- abs(y_actual) + abs(y_pred)
  terms <- ifelse(denom == 0, 0, abs(y_pred - y_actual) / denom)
  mean(terms)
}
Predict Without Holdays
# Adding total sales column to data and prepare for prophet
# NOTE(review): `train_wide` is built elsewhere (not in this file). Presumably
# col 1 = date, col 2 = an id dropped here, col 3 = holiday label, remaining
# cols = per-series sales — TODO confirm against the upstream prep script.
train.prophet <- train_wide[,-2]
# Total daily sales summed across all sub-series (skips date + holiday cols).
Tot_Sales <- rowSums(train.prophet[,-c(1,2)])
# Keep date + holiday label aside for the holiday-aware model further down.
holidays <- train.prophet[,c(1,2)]
# Univariate frame for prophet: date column plus the total-sales column.
train.prophet <- cbind(train.prophet[,1],Tot_Sales)
# Split into training and validation (use last 20 days as validation data)
n <- nrow(train.prophet)
train.valid <- train.prophet[(n-19):n, ]
colnames(train.valid) <- c("date","y")
train.valid$date <- as.Date(train.valid$date)
train.real <- train.prophet[1:(n-20), ]
# Preparing for prophet
# NOTE(review): $date is accessed before the rename on the next line, so this
# relies on the first column being named "date" after the cbind — confirm.
train.real$date <- as.Date(train.real$date)
# prophet requires the input columns to be named ds (date) and y (value).
colnames(train.real) <- c("ds","y")
# Running prophet with no parameters set
# Baseline: default prophet fit on total daily sales.
train.pro <- prophet(train.real)
# Extend the date frame 20 days past training — i.e. the validation window.
future <- make_future_dataframe(train.pro, periods = 20)
result_pro <- predict(train.pro,future)
# plot to see how well the model predicted validation data
# red is predicted; black is actual
plot1 <- ggplot()+
# The last 20 forecast rows are the out-of-sample predictions.
geom_line(data = result_pro %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
scale_x_date(labels = scales::date_format("%b %d"),
breaks = scales::date_breaks("1 day"))+
theme_minimal()+
theme(
axis.text.x = element_text(angle = 45),
legend.position = "bottom"
)
# Interactive version of the comparison plot.
ggplotly(plot1 , dynamicTicks = TRUE)
# Making the model better
# 1. Taking holiday into account; I already constructed a holidays table earlier
# prophet expects a holidays frame with columns ds (date) and holiday (name).
colnames(holidays) <- c("ds","holiday")
holidays$holiday <- as.character(holidays$holiday)
train.pro.new <- prophet(train.real,holidays = holidays)
future <- make_future_dataframe(train.pro.new, periods = 20)
result.pro.new <- predict(train.pro.new,future)
# plot to see if the model improved
# black is actual; green is old; red is new
plot2 <- ggplot()+
geom_line(data = result.pro.new %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = result_pro %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "green")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
scale_x_date(labels = scales::date_format("%b %d"),
breaks = scales::date_breaks("1 day"))+
theme_minimal()+
theme(
axis.text.x = element_text(angle = 45),
legend.position = "bottom"
)
ggplotly(plot2 , dynamicTicks = TRUE)
# The model did worse, so I'm going to remove holiday data from the model
# 2. Changing seasonality.mode to multiplicative
# (seasonal effects scale with the trend level rather than adding to it).
train.pro.new2 <- prophet(train.real, seasonality.mode = "multiplicative")
future <- make_future_dataframe(train.pro.new2, periods = 20)
result.pro.new2 <- predict(train.pro.new2,future)
# plot to see if the model improved
# black is actual; green is old; red is new
# NOTE(review): this reuses the name plot2 and overwrites the previous plot.
plot2 <- ggplot()+
geom_line(data = result.pro.new2 %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = result.pro.new %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "green")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
scale_x_date(labels = scales::date_format("%b %d"),
breaks = scales::date_breaks("1 day"))+
theme_minimal()+
theme(
axis.text.x = element_text(angle = 45),
legend.position = "bottom"
)
ggplotly(plot2 , dynamicTicks = TRUE)
# 2. Change point scale
# reference: https://www.kaggle.com/holdenyau/prophet/code
# Grid-search changepoint.prior.scale over 0.01..0.10, recording the
# in-sample and validation RMSE for each candidate. Sizes are derived from
# the data instead of the hard-coded 1659/1679 of the original notebook.
n.fit <- nrow(train.real)    # in-sample rows (1659 in the original run)
n.hold <- nrow(train.valid)  # held-out rows (20)
cp.scales <- seq_len(10) / 100
result <- data.frame(scale = cp.scales,
                     training_RMSE = NA_real_,
                     valid_RMSE = NA_real_)
for (i in seq_along(cp.scales)) {
  train.pro.opt <- prophet(train.real, changepoint.prior.scale = cp.scales[i])
  future <- make_future_dataframe(train.pro.opt, periods = n.hold)
  result.pro.opt <- predict(train.pro.opt, future)
  # accuracy() is provided by the forecast package (loaded as a dependency
  # of hts); element [2] of its summary row is the RMSE.
  result$training_RMSE[i] <- accuracy(result.pro.opt$yhat[1:n.fit],
                                      train.real$y)[2]
  result$valid_RMSE[i] <- accuracy(result.pro.opt$yhat[(n.fit + 1):(n.fit + n.hold)],
                                   train.valid$y)[2]
}
# Compare train vs validation RMSE across the candidate scales.
result %>%
  pivot_longer(c(training_RMSE, valid_RMSE),
               names_to = "type", values_to = "RMSE") %>%
  ggplot() +
  geom_point(aes(x = scale, y = RMSE, col = type)) +
  geom_line(aes(x = scale, y = RMSE, group = type)) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Tuned-changepoint forecast (red) vs baseline (green) vs actual (black).
plot3 <- ggplot()+
# NOTE(review): result.pro.opt holds whatever the last tuning-loop iteration
# produced (scale = 0.10), not necessarily the best scale — confirm intent.
geom_line(data = result.pro.opt %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = result_pro %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "green")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
scale_x_date(labels = scales::date_format("%b %d"),
breaks = scales::date_breaks("1 day"))+
theme_minimal()+
theme(
axis.text.x = element_text(angle = 45),
legend.position = "bottom"
)
ggplotly(plot3 , dynamicTicks = TRUE)
# SMAPE of validation data ----
# SMAPE of the baseline model over the 20 held-out validation days.
smape(train.valid$y,result_pro$yhat %>% tail(20))
## [1] 0.03945145
# SMAPE of the tuned-changepoint model — essentially no improvement.
smape(train.valid$y,result.pro.opt$yhat %>% tail(20))
## [1] 0.03955245
# Preparing proportion table for splitting forecast data to sub time series
# Top-down disaggregation: each sub-series receives its historical average
# share of total sales, applied to the forecast of the total. The original
# element-wise double loops are replaced by vectorised column operations,
# and the hard-coded sizes (47 / 1659 / 1679) are derived from the data.
train.proportion <- train_wide[,-c(2,3)]
Tot_Sales <- rowSums(train.proportion[,-1])
train.proportion <- cbind(train.proportion, Tot_Sales)
n.fit <- nrow(train.real)          # training rows (1659 in the original run)
n.all <- length(result_pro$yhat)   # training + forecast rows (1679)
n.col <- ncol(train.proportion)    # date + 45 sub-series + Tot_Sales (47)
# Restrict to the training window, then convert every sales column to its
# share of that day's total (the Tot_Sales column itself becomes 1).
train.proportion <- train.proportion[1:n.fit, ]
day.total <- train.proportion[[n.col]]
train.proportion[, 2:n.col] <- train.proportion[, 2:n.col] / day.total
# Calculating average proportion per column over the training window.
avg.proportion <- colMeans(train.proportion[, -1])
# Assigning forecasted value based on average and forecasted total sales.
# Rows beyond n.fit are created by this assignment (their date column is
# NA); only the last 20 rows are used downstream.
train.proportion[1:n.all, 2:n.col] <- outer(result_pro$yhat[1:n.all],
                                            avg.proportion)
# Per-series SMAPE of the top-down forecast over the last 20 days.
# train_wide columns 4:48 hold the 45 sub-series actuals; the matching
# forecast column in train.proportion is offset by 2 (its first column is
# the date). vapply replaces the original rbind-in-a-loop accumulation.
smape.values <- vapply(4:48, function(i) {
  actual <- as.numeric(unlist(train_wide[, i] %>% tail(20)))
  pred <- train.proportion[, (i - 2)] %>% tail(20)
  smape(actual, pred)
}, numeric(1))
# Taking the average over the 45 sub-series
smape.final <- mean(smape.values); smape.final
## [1] 0.1524263
# Cross-Validation Function ----
#' Rolling-origin cross-validation for a prophet model.
#'
#' Slides a fixed-width training window across `y`, fits a default prophet
#' model on each window, forecasts `h` steps ahead, and scores the forecast
#' against the held-out rows with smape().
#'
#' @param y two-column frame (date, value) already shaped for prophet.
#' @param window number of rows in each training window.
#' @param h forecast horizon: rows held out immediately after the window.
#' @return a one-column matrix of SMAPE values. Note the first row is NA
#'   (the empty seed of the accumulator), so summarise with na.rm = TRUE.
prophet.CV <- function(y, window, h) {
  errors <- matrix(ncol = 1)
  last.origin <- nrow(y) - (window + h) + 1
  for (origin in seq(1, last.origin)) {
    fit.rows <- origin:(origin + window - 1)
    test.rows <- (origin + window):(origin + window + h - 1)
    # Hold-out slice, renamed the way the scoring step expects.
    holdout <- y[test.rows, ]
    colnames(holdout) <- c("date", "y")
    holdout$date <- as.Date(holdout$date)
    # Training slice in prophet's required (ds, y) format.
    fit.data <- y[fit.rows, ]
    fit.data$date <- as.Date(fit.data$date)
    colnames(fit.data) <- c("ds", "y")
    # Fit and forecast h steps past the training window.
    model <- prophet(fit.data)
    horizon <- make_future_dataframe(model, periods = h)
    fc <- predict(model, horizon)
    print(paste("Done", as.character(origin)))
    # The last h fitted values are the out-of-sample forecasts.
    fold.error <- smape(tail(fc$yhat, h), holdout$y)
    errors <- rbind(errors, fold.error)
  }
  return(errors)
}
# Adding total sales column to data and prepare for prophet
# (Same preparation as earlier in the script, rebuilt here so the CV run
# is self-contained after the intermediate objects were reused above.)
train.prophet <- train_wide[,-2]
Tot_Sales <- rowSums(train.prophet[,-c(1,2)])
holidays <- train.prophet[,c(1,2)]
train.prophet <- cbind(train.prophet[,1],Tot_Sales)
# Run CV
# 1650-row window + 20-day horizon over the full series → 10 rolling folds.
cv.acc = prophet.CV(y = train.prophet, window = 1650, h = 20)
## [1] "Done 1"
## [1] "Done 2"
## [1] "Done 3"
## [1] "Done 4"
## [1] "Done 5"
## [1] "Done 6"
## [1] "Done 7"
## [1] "Done 8"
## [1] "Done 9"
## [1] "Done 10"
# CV Errors ----
# Mean SMAPE
# na.rm = TRUE drops the leading NA row of prophet.CV's accumulator matrix.
mean(cv.acc, na.rm = T)
## [1] 0.04085337
# Standard Deviation of SMAPE
# Small spread → forecast accuracy is stable across the rolling folds.
sd(cv.acc, na.rm = T)
## [1] 0.002066195